from lxml import etree

from Spider4Mirror.model.maven_info import MavenInfo
from Spider4Mirror.spider.base_spider import BaseSpider

# Root URL of the repository listing; run() starts crawling from here and the
# group-level URLs are built by appending the group text to it.
MAVEN_MIRROR_BASE_URL = "https://repo1.maven.org/maven2/"
# XPath applied to every fetched page: the text of each <a> element.
COMMON_XPATH = "//a/text()"


class MavenMirrorSpider(BaseSpider):
    """Spider that walks the Maven mirror listing three levels deep.

    The three levels are treated as group id -> artifact id -> version.
    Every version directory that is not stored yet becomes a ``MavenInfo``
    object, at most ``limit`` objects per run, and the collected batch is
    inserted at the end of :meth:`run`.

    NOTE(review): ``self.log``, ``self.mysql_util`` and ``self.get_url`` are
    not defined in this class -- presumably they are provided by
    ``BaseSpider``; confirm there.
    """

    def __init__(self, limit=10):
        """
        :param limit: number of new records to collect before the crawl stops
        """
        super().__init__()
        self.limit = limit
        # Number of new records collected so far.
        self.count = 0
        # MavenInfo objects waiting to be inserted by run().
        self.objs = list()

    @staticmethod
    def _is_directory(node_text):
        """Return True when a listing entry's text ends with "/"."""
        return node_text.endswith("/")

    def _limit_reached(self):
        """Log and return True once ``count`` equals ``limit``."""
        if self.count == self.limit:
            self.log.info("已达到限制数量")
            return True
        return False

    def find_all_node(self, this_level_url, next_level_text):
        """
        Fetch the next-level page and return the text of all its links.

        :param this_level_url: URL of the current level
        :param next_level_text: link text appended to reach the next level
        :return: list of anchor texts (empty when the page has no parsable
                 element content)
        """
        url = self.concatenate_next_level_url(this_level_url, next_level_text)
        res = self.get_url(url)
        select = etree.HTML(res.text)
        if select is None:
            # The parser may hand back no root element for a body without
            # element content; treat that as "no links" instead of failing
            # on ``None.xpath``.
            return []
        return select.xpath(COMMON_XPATH)

    @classmethod
    def concatenate_next_level_url(cls, this_level_url, next_level_text):
        """
        Build the URL of the next level.

        :param this_level_url: URL of the current level
        :param next_level_text: link text of the child entry
        :return: ``this_level_url`` immediately followed by
                 ``next_level_text`` (no separator is inserted)
        """
        return "{0}{1}".format(this_level_url, next_level_text)

    def artifact_id_level_loop(self, group_id):
        """
        Iterate over the artifact entries listed under one group.

        :param group_id: group link text as it appears in the listing
        """
        if not self._is_directory(group_id):
            # Same rule the version level applies: an entry without a
            # trailing "/" is not a directory, so there is no listing page
            # to fetch for it.
            return
        second_url = self.concatenate_next_level_url(MAVEN_MIRROR_BASE_URL, group_id)
        # The first anchor is dropped -- presumably the parent-directory link.
        artifact_ids = self.find_all_node(MAVEN_MIRROR_BASE_URL, group_id)[1:]
        for artifact_id in artifact_ids:
            if self._limit_reached():
                break
            self.version_level_loop(second_url, group_id, artifact_id)

    @classmethod
    def create_info_obj(cls, group_id, artifact_id, version):
        """
        Build the model object for one group/artifact/version triple.

        The last character of every argument is removed; the callers in this
        class pass link texts that end with "/".

        :param group_id: group link text
        :param artifact_id: artifact link text
        :param version: version link text
        :return: populated ``MavenInfo`` object
        """
        obj = MavenInfo()
        obj.group_id = group_id[:-1]
        obj.artifact_id = artifact_id[:-1]
        obj.version = version[:-1]
        return obj

    def version_level_loop(self, second_url, group_id, artifact_id):
        """
        Iterate over the version entries listed under one artifact.

        Each version directory that is not stored yet is appended to
        ``self.objs`` and counted against ``self.limit``.

        :param second_url: URL of the group level
        :param group_id: group link text
        :param artifact_id: artifact link text
        """
        if not self._is_directory(artifact_id):
            # A plain file at the artifact level has no version listing.
            return
        # The first anchor is dropped -- presumably the parent-directory link.
        versions = self.find_all_node(second_url, artifact_id)[1:]
        for version in versions:
            if self._limit_reached():
                break
            # Run the cheap string test first so that plain files never cost
            # a database round trip.
            skip = not self._is_directory(version)
            if not skip:
                skip = self.mysql_util.query_one(
                    MavenInfo, [MavenInfo.group_id == group_id[:-1],
                                MavenInfo.artifact_id == artifact_id[:-1],
                                MavenInfo.version == version[:-1]], [])
            if skip:
                self.log.info("版本已存在或不是版本：{0}{1}{2}".format(group_id, artifact_id, version))
                continue
            self.objs.append(self.create_info_obj(group_id, artifact_id, version))
            self.count += 1

    def run(self):
        """Entry point: ensure the table exists, crawl, insert the new rows."""
        if not self.mysql_util.judge_table_is_exist(MavenInfo.__tablename__):
            self.log.info("表不存在，创建表")
            MavenInfo.__table__.create(self.mysql_util.engine)
        # The first anchor is dropped -- presumably the parent-directory link.
        group_ids = self.find_all_node(MAVEN_MIRROR_BASE_URL, "")[1:]
        for group_id in group_ids:
            if self._limit_reached():
                break
            self.artifact_id_level_loop(group_id)
        self.log.info("插入新增信息")
        self.mysql_util.insert_many(self.objs)


if __name__ == '__main__':
    import sys

    # Exactly one command-line argument is taken as the record limit; any
    # other argument count falls back to the spider's default limit.
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        spider = MavenMirrorSpider()
    else:
        spider = MavenMirrorSpider(int(cli_args[0]))
    spider.run()
